{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introductory examples"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.usa.gov data from bit.ly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%pwd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "open(path).readline()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import json\n",
    "path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'\n",
    "records = [json.loads(line) for line in open(path)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "records[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "records[0]['tz']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(records[0]['tz'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Counting time zones in pure Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "time_zones = [rec['tz'] for rec in records]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "time_zones = [rec['tz'] for rec in records if 'tz' in rec]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "time_zones[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_counts(sequence):\n",
    "    counts = {}\n",
    "    for x in sequence:\n",
    "        if x in counts:\n",
    "            counts[x] += 1\n",
    "        else:\n",
    "            counts[x] = 1\n",
    "    return counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "def get_counts2(sequence):\n",
    "    counts = defaultdict(int) # values will initialize to 0\n",
    "    for x in sequence:\n",
    "        counts[x] += 1\n",
    "    return counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "counts = get_counts(time_zones)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "counts['America/New_York']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "len(time_zones)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def top_counts(count_dict, n=10):\n",
    "    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]\n",
    "    value_key_pairs.sort()\n",
    "    return value_key_pairs[-n:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "top_counts(counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "counts = Counter(time_zones)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "counts.most_common(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Counting time zones with pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "from numpy.random import randn\n",
    "import numpy as np\n",
    "import os\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "plt.rc('figure', figsize=(10, 6))\n",
    "np.set_printoptions(precision=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import json\n",
    "path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'\n",
    "lines = open(path).readlines()\n",
    "records = [json.loads(line) for line in lines]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from pandas import DataFrame, Series\n",
    "import pandas as pd\n",
    "\n",
    "frame = DataFrame(records)\n",
    "frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "frame['tz'][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tz_counts = frame['tz'].value_counts()\n",
    "tz_counts[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "clean_tz = frame['tz'].fillna('Missing')\n",
    "clean_tz[clean_tz == ''] = 'Unknown'\n",
    "tz_counts = clean_tz.value_counts()\n",
    "tz_counts[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10, 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tz_counts[:10].plot(kind='barh', rot=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "frame['a'][1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "frame['a'][50]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "frame['a'][51]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "results = Series([x.split()[0] for x in frame.a.dropna()])\n",
    "results[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "results.value_counts()[:8]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cframe = frame[frame.a.notnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "operating_system = np.where(cframe['a'].str.contains('Windows'),\n",
    "                            'Windows', 'Not Windows')\n",
    "operating_system[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "by_tz_os = cframe.groupby(['tz', operating_system])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "agg_counts = by_tz_os.size().unstack().fillna(0)\n",
    "agg_counts[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Use to sort in ascending order\n",
    "indexer = agg_counts.sum(1).argsort()\n",
    "indexer[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "count_subset = agg_counts.take(indexer)[-10:]\n",
    "count_subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "count_subset.plot(kind='barh', stacked=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "normed_subset = count_subset.div(count_subset.sum(1), axis=0)\n",
    "normed_subset.plot(kind='barh', stacked=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MovieLens 1M data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "encoding = 'latin1'\n",
    "\n",
    "upath = os.path.expanduser('ch02/movielens/users.dat')\n",
    "rpath = os.path.expanduser('ch02/movielens/ratings.dat')\n",
    "mpath = os.path.expanduser('ch02/movielens/movies.dat')\n",
    "\n",
    "unames = ['user_id', 'gender', 'age', 'occupation', 'zip']\n",
    "rnames = ['user_id', 'movie_id', 'rating', 'timestamp']\n",
    "mnames = ['movie_id', 'title', 'genres']\n",
    "\n",
    "users = pd.read_csv(upath, sep='::', header=None, names=unames, encoding=encoding)\n",
    "ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames, encoding=encoding)\n",
    "movies = pd.read_csv(mpath, sep='::', header=None, names=mnames, encoding=encoding)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "users[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ratings[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "movies[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = pd.merge(pd.merge(ratings, users), movies)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data.ix[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mean_ratings = data.pivot_table('rating', index='title',\n",
    "                                columns='gender', aggfunc='mean')\n",
    "mean_ratings[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ratings_by_title = data.groupby('title').size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "ratings_by_title[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "active_titles = ratings_by_title.index[ratings_by_title >= 250]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "active_titles[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mean_ratings = mean_ratings.ix[active_titles]\n",
    "mean_ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':\n",
    "                           'Seven Samurai (Shichinin no samurai) (1954)'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "top_female_ratings = mean_ratings.sort_index(by='F', ascending=False)\n",
    "top_female_ratings[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Measuring rating disagreement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sorted_by_diff = mean_ratings.sort_index(by='diff')\n",
    "sorted_by_diff[:15]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Reverse order of rows, take first 15 rows\n",
    "sorted_by_diff[::-1][:15]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Standard deviation of rating grouped by title\n",
    "rating_std_by_title = data.groupby('title')['rating'].std()\n",
    "# Filter down to active_titles\n",
    "rating_std_by_title = rating_std_by_title.ix[active_titles]\n",
    "# Order Series by value in descending order\n",
    "rating_std_by_title.order(ascending=False)[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### US Baby Names 1880-2010"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "from numpy.random import randn\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rc('figure', figsize=(12, 5))\n",
    "np.set_printoptions(precision=4)\n",
    "%pwd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "http://www.ssa.gov/oact/babynames/limits.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "!head -n 10 ch02/names/yob1880.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "names1880 = pd.read_csv('ch02/names/yob1880.txt', names=['name', 'sex', 'births'])\n",
    "names1880"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "names1880.groupby('sex').births.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 2010 is the last available year right now\n",
    "years = range(1880, 2011)\n",
    "\n",
    "pieces = []\n",
    "columns = ['name', 'sex', 'births']\n",
    "\n",
    "for year in years:\n",
    "    path = 'ch02/names/yob%d.txt' % year\n",
    "    frame = pd.read_csv(path, names=columns)\n",
    "\n",
    "    frame['year'] = year\n",
    "    pieces.append(frame)\n",
    "\n",
    "# Concatenate everything into a single DataFrame\n",
    "names = pd.concat(pieces, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_births = names.pivot_table('births', index='year',\n",
    "                                 columns='sex', aggfunc=sum)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_births.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_births.plot(title='Total births by sex and year')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def add_prop(group):\n",
    "    # Integer division floors\n",
    "    births = group.births.astype(float)\n",
    "\n",
    "    group['prop'] = births / births.sum()\n",
    "    return group\n",
    "names = names.groupby(['year', 'sex']).apply(add_prop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_top1000(group):\n",
    "    return group.sort_index(by='births', ascending=False)[:1000]\n",
    "grouped = names.groupby(['year', 'sex'])\n",
    "top1000 = grouped.apply(get_top1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pieces = []\n",
    "for year, group in names.groupby(['year', 'sex']):\n",
    "    pieces.append(group.sort_index(by='births', ascending=False)[:1000])\n",
    "top1000 = pd.concat(pieces, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "top1000.index = np.arange(len(top1000))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "top1000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Analyzing naming trends"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "boys = top1000[top1000.sex == 'M']\n",
    "girls = top1000[top1000.sex == 'F']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_births = top1000.pivot_table('births', index='year', columns='name',\n",
    "                                   aggfunc=sum)\n",
    "total_births"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]\n",
    "subset.plot(subplots=True, figsize=(12, 10), grid=False,\n",
    "            title=\"Number of births per year\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Measuring the increase in naming diversity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.figure()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "table = top1000.pivot_table('prop', index='year',\n",
    "                            columns='sex', aggfunc=sum)\n",
    "table.plot(title='Sum of table1000.prop by year and sex',\n",
    "           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df = boys[boys.year == 2010]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()\n",
    "prop_cumsum[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "prop_cumsum.values.searchsorted(0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df = boys[boys.year == 1900]\n",
    "in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()\n",
    "in1900.values.searchsorted(0.5) + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_quantile_count(group, q=0.5):\n",
    "    group = group.sort_index(by='prop', ascending=False)\n",
    "    return group.prop.cumsum().values.searchsorted(q) + 1\n",
    "\n",
    "diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)\n",
    "diversity = diversity.unstack('sex')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_quantile_count(group, q=0.5):\n",
    "    group = group.sort_index(by='prop', ascending=False)\n",
    "    return group.prop.cumsum().values.searchsorted(q) + 1\n",
    "diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)\n",
    "diversity = diversity.unstack('sex')\n",
    "diversity.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "diversity.plot(title=\"Number of popular names in top 50%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### The \"Last letter\" Revolution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# extract last letter from name column\n",
    "get_last_letter = lambda x: x[-1]\n",
    "last_letters = names.name.map(get_last_letter)\n",
    "last_letters.name = 'last_letter'\n",
    "\n",
    "table = names.pivot_table('births', index=last_letters,\n",
    "                          columns=['sex', 'year'], aggfunc=sum)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "subtable = table.reindex(columns=[1910, 1960, 2010], level='year')\n",
    "subtable.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "subtable.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "letter_prop = subtable / subtable.sum().astype(float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "fig, axes = plt.subplots(2, 1, figsize=(10, 8))\n",
    "letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')\n",
    "letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',\n",
    "                      legend=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.subplots_adjust(hspace=0.25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "letter_prop = table / table.sum().astype(float)\n",
    "\n",
    "dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T\n",
    "dny_ts.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.close('all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dny_ts.plot()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Boy names that became girl names (and vice versa)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_names = top1000.name.unique()\n",
    "mask = np.array(['lesl' in x.lower() for x in all_names])\n",
    "lesley_like = all_names[mask]\n",
    "lesley_like"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "filtered = top1000[top1000.name.isin(lesley_like)]\n",
    "filtered.groupby('name').births.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "table = filtered.pivot_table('births', index='year',\n",
    "                             columns='sex', aggfunc='sum')\n",
    "table = table.div(table.sum(1), axis=0)\n",
    "table.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.close('all')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "table.plot(style={'M': 'k-', 'F': 'k--'})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
